{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
      "  return f(*args, **kwds)\n",
      "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
      "  return f(*args, **kwds)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from tqdm import tqdm\n",
    "from sklearn.preprocessing import LabelBinarizer,LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '../input/'\n",
    "train = pd.read_csv(path + 'final_train.csv')\n",
    "test = pd.read_csv(path + 'final_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((89806693, 5), (79288375, 4))"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.shape,test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_api = train['api'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(295,)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unique_api.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "api2index = {item:(i+1) for i,item in enumerate(unique_api)}\n",
    "index2api = {(i+1):item for i,item in enumerate(unique_api)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['api_idx'] = train['api'].map(api2index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "test['api_idx'] = test['api'].map(api2index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_period_idx = train.file_id.drop_duplicates(keep='first').index.values\n",
    "test_period_idx = test.file_id.drop_duplicates(keep='first').index.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_sequence(df,period_idx):\n",
    "    seq_list = []\n",
    "    for _id,begin in enumerate(period_idx[:-1]):\n",
    "        seq_list.append(df.iloc[begin:period_idx[_id+1]]['api_idx'].values)\n",
    "    seq_list.append(df.iloc[period_idx[-1]:]['api_idx'].values)\n",
    "    return seq_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = train[['file_id','label']].drop_duplicates(keep='first')\n",
    "test_df = test[['file_id']].drop_duplicates(keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['seq'] = get_sequence(train,train_period_idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df['seq'] = get_sequence(test,test_period_idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(19350.97816934013, 6466.961402750774, 888204)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.seq.map(lambda x: len(x)).std(),train_df.seq.map(lambda x: len(x)).mean(),train_df.seq.map(lambda x: len(x)).max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15911.676663585444, 6120.291393284446, 769590)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.seq.map(lambda x: len(x)).std(),test_df.seq.map(lambda x: len(x)).mean(),test_df.seq.map(lambda x: len(x)).max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/enjoy/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
      "  return f(*args, **kwds)\n",
      "/home/enjoy/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional\n",
    "from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten\n",
    "from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D\n",
    "from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average\n",
    "from keras.models import Model\n",
    "from keras.optimizers import RMSprop,Adam\n",
    "from keras.layers.normalization import BatchNormalization\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "from keras.optimizers import SGD\n",
    "from keras import backend as K\n",
    "from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation\n",
    "from keras.layers import SpatialDropout1D\n",
    "from keras.layers.wrappers import Bidirectional"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def TextCNN(max_len,max_cnt,embed_size,\n",
    "            num_filters,kernel_size,\n",
    "            conv_action,\n",
    "            mask_zero):\n",
    "    _input = Input(shape=(max_len,), dtype='int32')\n",
    "    _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(_input)\n",
    "    _embed = SpatialDropout1D(0.15)(_embed)\n",
    "    warppers = []\n",
    "    for _kernel_size in kernel_size:\n",
    "        conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action)(_embed)\n",
    "        warppers.append(GlobalMaxPooling1D()(conv1d))\n",
    "                        \n",
    "    fc = concatenate(warppers)\n",
    "    fc = Dropout(0.5)(fc)\n",
    "    #fc = BatchNormalization()(fc)\n",
    "    fc = Dense(256, activation='relu')(fc)\n",
    "    fc = Dropout(0.25)(fc)\n",
    "    #fc = BatchNormalization()(fc) \n",
    "    preds = Dense(8, activation = 'softmax')(fc)\n",
    "    \n",
    "    model = Model(inputs=_input, outputs=preds)\n",
    "    \n",
    "    model.compile(loss='categorical_crossentropy',\n",
    "        optimizer='adam',\n",
    "        metrics=['accuracy'])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_labels = pd.get_dummies(train_df.label).values\n",
    "train_seq = pad_sequences(train_df.seq.values, maxlen = 6000)\n",
    "test_seq = pad_sequences(test_df.seq.values, maxlen = 6000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_len = 6000\n",
    "max_cnt = 295\n",
    "embed_size = 256\n",
    "num_filters = 64\n",
    "kernel_size = [2,4,6,8,10,12,14]\n",
    "conv_action = 'relu'\n",
    "mask_zero = False\n",
    "TRAIN = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FOLD: \n",
      "2780 11107\n",
      "2780/2780 [==============================] - 5s 2ms/step\n",
      "12955/12955 [==============================] - 18s 1ms/step\n",
      "FOLD: \n",
      "2779 11108\n",
      "2779/2779 [==============================] - 4s 2ms/step\n",
      "12955/12955 [==============================] - 18s 1ms/step\n",
      "FOLD: \n",
      "2777 11110\n",
      "2777/2777 [==============================] - 4s 2ms/step\n",
      "12955/12955 [==============================] - 18s 1ms/step\n",
      "FOLD: \n",
      "2776 11111\n",
      "2776/2776 [==============================] - 4s 2ms/step\n",
      "12955/12955 [==============================] - 18s 1ms/step\n",
      "FOLD: \n",
      "2775 11112\n",
      "2775/2775 [==============================] - 5s 2ms/step\n",
      "12955/12955 [==============================] - 19s 1ms/step\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
    "meta_train = np.zeros(shape = (len(train_seq),8))\n",
    "meta_test = np.zeros(shape = (len(test_seq),8))\n",
    "FLAG = False\n",
    "for i,(tr_ind,te_ind) in enumerate(skf):\n",
    "    print('FOLD: '.format(i))\n",
    "    print(len(te_ind),len(tr_ind))\n",
    "    model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)\n",
    "    model_name = 'benchmark_textcnn_fold_'+str(i)\n",
    "    X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]\n",
    "    X_val,X_val_label = train_seq[te_ind],train_labels[te_ind]\n",
    "    \n",
    "    model = TextCNN(max_len,max_cnt,embed_size,\n",
    "            num_filters,kernel_size,\n",
    "            conv_action,\n",
    "            mask_zero)\n",
    "    \n",
    "    model_save_path = '../model_weight_final/%s_%s.hdf5'%(model_name,embed_size)\n",
    "    early_stopping =EarlyStopping(monitor='val_loss', patience=3)\n",
    "    model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)\n",
    "    if TRAIN and FLAG:\n",
    "        model.fit(X_train,X_train_label,\n",
    "                  validation_data=(X_val,X_val_label),\n",
    "                  epochs=100,batch_size=64,\n",
    "                  shuffle=True,\n",
    "                  callbacks=[early_stopping,model_checkpoint]\n",
    "                 )\n",
    "    \n",
    "    model.load_weights(model_save_path)\n",
    "    pred_val = model.predict(X_val,batch_size=128,verbose=1)\n",
    "    pred_test = model.predict(test_seq,batch_size=128,verbose=1)\n",
    "    \n",
    "    meta_train[te_ind] = pred_val\n",
    "    meta_test += pred_test\n",
    "    K.clear_session()\n",
    "meta_test /= 5.0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.to_pickle(meta_train,'../train_meta_cnn.pkl')\n",
    "pd.to_pickle(meta_test,'../test_meta_cnn.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/home/enjoy/tianchi/安全赛复赛/src'"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
